# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen
import re

# Where the downloaded page is stored, and which page to download.
htmlpath = '../htmlfiles/weather.html'
url = 'http://www.scientificamerican.com/'

# get a html file
# Fetch the page BEFORE opening the output file: opening with a write mode
# truncates the file, so a failed download would otherwise leave an empty
# file behind.
page = urlopen(url)
try:
    html = page.read()
finally:
    # Release the HTTP response even if reading fails.
    page.close()

# Store the payload exactly as received: binary mode avoids newline
# translation, and write() (unlike the print statement) appends no extra
# trailing newline. The with-block closes f1 on exit, so no explicit
# close() call is needed.
with open(htmlpath, 'wb') as f1:
    f1.write(html)

# load html for next operation
with open(htmlpath, 'r') as f2:
    soup = BeautifulSoup(f2)
    beautext = soup.prettify()
    print beautext
    # regular search
    title = soup.title
    print 'title =', title.text
    # tree search
    scp = soup.html.body.div(recursive=False)[0].script
    print scp.text
    # findAll() method using lambda function
    result = soup.findAll(lambda tag: len(tag.name)==1 and not tag.attrs)
    print [_t.text for _t in result]
    # findAll() method using regular expression
    result1 = soup.findAll(text=re.compile(r"\.*href"))
    print [_t for _t in result1]
    f2.close()
    